home *** CD-ROM | disk | FTP | other *** search
- /*
- PROGRAM NAME: merge.c
-
- PURPOSE: This program is used to merge two separate hierarchies.
- The program first reads each inverted file. It then reads in the
- corresponding link file which gives parent-child links to build
- the hierarchy. It can then perform two different types of mergers:
-
- 1) Simple merge in which a point of connection between the two
- thesauri is made wherever they have terms in common.
-
- 2) Complex merge in which any two terms are connected if they
- have sufficiently (above a specified threshold) similar
- sets of parent and child terms.
-
- INPUT FILES REQUIRED:
-
- 1) inverted file for 1st hierarchy
-
- 2) links file for 1st hierarchy
-
- 3) inverted file for 2nd hierarchy
-
- 4) links file for 2nd hierarchy
-
- An inverted file consists of sequences of
-
- term document number weight
-
- (multiple entries for any term should be grouped together)
-
- A link file consists of sequences of
-
- parent term child term
-
- PARAMETERS TO BE SET BY USER:
-
- 1) MAXWORD: which specifies the maximum size of a term
- 2) THRESHOLD: which specifies the minimum similarity level
- for use in complex merge.
-
- COMMAND LINE:
-
- merge inverted_file_1 link_file_1 inverted_file_2 link_file_2 output_file
-
- ****************************************************************************/
-
- #include <stdio.h>
- #include <string.h>
- #include <math.h>
- #define MAXWORD 20 /* maximum size of a term */
- #define THRESHOLD 0.6 /* similarity threshold for complex_merge */
-
- struct doclist { /* sequences of document # and weight */
- int doc; /* document number */
- float weight; /* term weight in document */
- struct doclist *nextdoc; /* ptr. to next doclist record */
- } doclistfile;
-
- struct parentlist { /* sequences of parent terms */
- char term[MAXWORD]; /* parent term */
- struct invert *parent; /* ptr. to parent term in inverted file */
- struct parentlist *nextparent;/* ptr. to next parentlist record */
- } parentfile;
-
- struct connections { /* holds information about connected terms */
- struct invert *termadd; /* address of connected term in inverted file */
- struct connections *next_connection /* ptr. to next connections record */
- } connectlist;
-
- struct childlist { /* sequences of child terms */
- char term[MAXWORD]; /* child term */
- struct invert *child; /* ptr. to child term in inverted file */
- struct childlist *nextchild; /* ptr. to next childlist record */
- } childfile;
-
- struct invert { /* inverted file */
- char term[MAXWORD]; /* term */
- struct doclist *doc; /* sequences of document # and weight */
- struct parentlist *parents; /* pointer to list of parent terms */
- struct childlist *children; /* pointer to list of children terms */
- struct connections *connect; /* pointer to connection in other hierarchy */
- struct invert *nextterm; /* ptr. to next invert record */
- } invfile;
-
- static struct invert *startinv; /* ptr. to first record in inverted file */
- static struct invert *lastinv; /* ptr. to last record in inverted file */
- static struct doclist *lastdoc; /* ptr. to last document in doclist */
- static struct invert *start_inv1; /* ptr. to the start of 1st inverted file */
- static struct invert *start_inv2; /* ptr. to the start of 2nd inverted file */
-
- static FILE *input1; /* first inverted file */
- static FILE *input2; /* first link file */
- static FILE *input3; /* second inverted file */
- static FILE *input4; /* second link file */
- static FILE *output; /* holds any outputs */
-
- static char currentterm[MAXWORD]; /* tracks current term in inverted file */
-
- static struct invert *get_mem_invert(); /* these four get_mem functions */
- static struct doclist *get_mem_doclist(); /* obtain memory dynamically for */
- static struct parentlist *get_mem_parentlist();/* storing different types of */
- static struct childlist *get_mem_childlist(); /* records. */
- static struct connections *get_mem_connections();
-
- static struct invert *find_term(); /* searches for term in inverted file and */
- /* returns address of the term */
- static int compare();
-
- static
- void initialize(), /* initialize global variables */
- open_files(), /* open files */
- read_invfile(), /* read in the inverted file */
- add_invert(), /* called within read_invfile() */
- read_links(), /* read in the links information */
- add_link(), /* called within read_links() */
- pr_invert(), /* print the inverted file */
- simple_merge(), /* simple merge between both hierarchies*/
- complex_merge(); /* complex merge between hierarchies */
-
- int main(argc,argv)
- int argc;
- char *argv[];
- {
-
- char ch;
-
- if (argc!=6)
- {
- (void) printf("There is an error in the command line\n");
- (void) printf("Correct usage is:\n");
- (void) printf("merge inverted_file_1 link_file_1 inverted_file_2 link_file_2 output_file\n");
- exit(1);
- }
-
- start_inv1 = NULL; start_inv2 = NULL; /* initialize start of both inverted files */
-
- initialize();
- open_files(argv);
- (void) fprintf(output,"\nREADING FIRST INVERTED FILE\n");
- read_invfile(input1);
- start_inv1 = startinv;
- (void) fprintf(output,"\nREADING FIRST LINK FILE\n");
- read_links(input2,start_inv1);
- (void) fprintf(output,"\nPRINTING FIRST INVERTED FILE\n\n");
- pr_invert(start_inv1);
- /* re-initialize */
- initialize();
- (void) fprintf(output,"\nREADING SECOND INVERTED FILE\n");
- read_invfile(input3);
- start_inv2 = startinv;
- (void) fprintf(output,"\nREADING SECOND LINK FILE\n");
- read_links(input4,start_inv2);
- (void) fprintf(output,"\nPRINTING SECOND INVERTED FILE\n\n");
- pr_invert(start_inv2);
-
- (void) printf("Make a selection\n");
- (void) printf("To use the simple_merge algorithm, enter 1\n");
- (void) printf("To use the complex_merge algorithm, enter 2\n");
- (void) printf("\nEnter selection: ");
-
- ch = getchar();
-
- switch(ch)
- {
- case '1':
- (void) fprintf(output,"\nPERFORMING A SIMPLE MERGE OF THE TWO INVERTED FILES\n");
- simple_merge(start_inv1,start_inv2);
- (void) fprintf(output,"\nPRINTING FIRST INVERTED FILE AFTER SIMPLE MERGE\n\n");
- pr_invert(start_inv1);
- (void) fprintf(output,"\nPRINTING SECOND INVERTED FILE AFTER SIMPLE MERGE\n\n");
- pr_invert(start_inv2);
- break;
- case '2':
- (void) fprintf(output,"\nPERFORMING A COMPLEX MERGE OF THE TWO INVERTED FILES\n");
- complex_merge(start_inv1,start_inv2);
- (void) fprintf(output,"\nPRINTING FIRST INVERTED FILE AFTER COMPLEX MERGE\n\n");
- pr_invert(start_inv1);
- (void) fprintf(output,"\nPRINTING SECOND INVERTED FILE AFTER COMPLEX MERGE\n\n");
- pr_invert(start_inv2);
- }
-
- (void) fclose(input1);(void) fclose(input2);
- (void) fclose(input3);(void) fclose(input4);(void) fclose(output);
-
- return(0);
-
- }
-
- /***************************************************************************
-
- open_files(argv)
-
- Returns: void
-
- Purpose: Open all input & output files
-
- **/
-
- static void open_files(argv)
- char *argv[];
- {
- if (( input1 = fopen(argv[1],"r")) == NULL ) {
- (void) printf("couldn't open file %s\n",argv[1]);
- exit(1); /* inverted file for first thesaurus hierarchy */
- }
-
- if (( input2 = fopen(argv[2],"r")) == NULL ) {
- (void) printf("couldn't open file %s\n",argv[2]);
- exit(1); /* link file for first thesaurus hierarchy */
- }
-
- if (( input3 = fopen(argv[3],"r")) == NULL ) {
- (void) printf("couldn't open file %s\n",argv[3]);
- exit(1); /* inverted file for second thesaurus hierarchy */
- }
-
- if (( input4 = fopen(argv[4],"r")) == NULL ) {
- (void) printf("couldn't open file %s\n",argv[4]);
- exit(1); /* link file for second thesaurus hierarchy */
- }
-
- if (( output = fopen(argv[5],"w")) == NULL) {
- (void) printf("couldn't open file %s for output \n",argv[5]);
- exit(1); /* output file */
- }
-
- }
-
- /***************************************************************************
-
- initialize()
-
- Returns: void
-
- Purpose: Initialize global variables
-
- **/
-
- static void initialize()
- {
- startinv = NULL; /* start of inverted file */
- lastinv = NULL; /* end of inverted file */
- lastdoc = NULL; /* last document considered */
- currentterm[0] = '\0'; /* current term being considered */
-
- }
-
- /***************************************************************************
-
- read_invfile(input)
-
- Returns: void
-
- Purpose: Read the inverted file from a disk file
-
- **/
-
- static void read_invfile(input)
- FILE *input;
- {
- int docid; /* holds current document number */
- char temp[MAXWORD]; /* holds current term */
- float weight; /* holds current term weight */
- struct doclist *p; /* structure to hold document numner-weight pair */
-
- (void) fscanf(input,"%s%d%f",temp,&docid,&weight); /* read next line */
- while (strlen(temp) > 0) /* while a legitimate line */
- {
- if (!strncmp(currentterm,temp,MAXWORD)) {
- /* if temp is the same as current term then simply add next document-weight info*/
- p = get_mem_doclist();
- p->doc = docid; /* assign doc. number */
- p->weight = weight; /* assign doc. weight */
- p->nextdoc = NULL;
- if (lastdoc) lastdoc->nextdoc = p; /* connect p to doclist chain for this term */
- lastdoc = p; /* set this global variable */
- }
- else add_invert(docid,temp,weight); /* temp not the same as current term, hence */
- temp[0] = '\0'; /* start a new entry in the inverted file */
- (void) fscanf(input,"%s%d%f",temp,&docid,&weight); /* read next input line */
- }
- }
-
- /***************************************************************************
-
- add_invert(docid,temp,weight)
-
- Returns: void
-
- Purpose: Called in read_invfile when a new term is being read from the file.
- Starts a new entry in the inverted file.
-
- **/
-
- static void add_invert(docid,temp,weight)
- int docid; /* in: document number */
- char temp[MAXWORD]; /* in: new index term */
- float weight; /* in: index term weight */
- {
- struct invert *p; /* structure p will be attached to inv. file */
-
- p = get_mem_invert(); /* get memory for p */
- (void) strncpy(p->term,temp,MAXWORD); /* copy over the term */
- p->parents = NULL; /* initially term has no parents */
- p->children = NULL; /* also no children terms */
- p->doc = get_mem_doclist(); /* get memory for a doclist structure */
- p->doc->doc = docid; /* assign the document number */
- p->doc->weight = weight; /* assign term weight */
- p->doc->nextdoc = NULL;
- p->connect = NULL; /* initially this term not connected to any */
- /* other in any other hierarchy */
- p->nextterm = NULL;
- if (startinv == NULL) startinv = p; /* if this is the 1st term in inverted file */
- if (lastinv) lastinv->nextterm = p; /* always update lastinv pointer */
- lastinv = p;
- lastdoc = p->doc;
- (void) strncpy(currentterm,temp,MAXWORD); /* update the value of currentterm to the */
- /* new term that has just been read */
- }
-
- /***************************************************************************
-
- read_links(input,startinv)
-
- Returns: void
-
- Purpose: Add the link information to the inverted file
-
- **/
-
- static void read_links(input,startinv)
- FILE *input; /* in: input file */
- struct invert *startinv; /* in: start of this inverted file */
- {
- char parent[MAXWORD], /* holds parent term */
- child[MAXWORD]; /* holds child term */
-
- (void) fscanf(input,"%s%s",parent,child); /* read first input line */
- while (strlen(parent) > 0 && strlen(child) > 0) /* while non-trivial input */
- {
- if (!find_term(parent,startinv) || !find_term(child,startinv))
- {
- (void) printf("Please check your input files\n");
- (void) printf("Term %s or term %s is not in inverted file\n",parent,child);
- exit(0);
- }
- add_link(parent,child,startinv); /* this function makes links */
- child[0] = '\0'; parent[0] = '\0'; /* throw out old parent & child info*/
- (void) fscanf(input,"%s%s",parent,child); /* read next line */
- }
- }
-
- /***************************************************************************
-
- add_link(parent,chile,startinv)
-
- Returns: void
-
- Purpose: Function is used within read_links
-
- **/
-
- static void add_link(parent,child,startinv)
- char parent[MAXWORD], /* in: specify parent term */
- child[MAXWORD]; /* in: specify child term */
- struct invert *startinv; /* in: specify start of this inv. file */
- {
- struct invert *p, *q; /* holds adds. of both terms in inv. file */
- struct parentlist *new_parent; /* structure to hold new parent info. */
- struct childlist *new_child; /* structure to hold new child info. */
-
- p = find_term(parent, startinv); /* find address of parent & child*/
- q = find_term(child, startinv); /* terms in the inverted file */
-
- /* first add parent links for the given child */
-
- new_parent = get_mem_parentlist(); /* get memory for parent record */
- (void) strncpy(new_parent->term,parent,MAXWORD); /* copy over parent term */
- new_parent->parent = p; /* store addr. (in inverted file) of parent term*/
- if (q->parents == NULL) { /* i.e. no parents listed for this child yet */
- q->parents = new_parent; /* first parent link made */
- new_parent->nextparent = NULL;
- }
- else { /* at least 1 parent already listed for given child */
- new_parent->nextparent = q->parents; /* attach new parent in front of list */
- q->parents = new_parent;
- }
-
- /* next add child links for given parent */
-
- new_child = get_mem_childlist(); /* get memory for child record */
- (void) strncpy(new_child->term,child,MAXWORD); /* copy over child term */
- new_child->child = q; /* store addr. (in inverted file) of child term */
- if (p->children == NULL) { /* i.e. no child terms listed for this parent*/
- p->children = new_child; /* first child link made */
- new_child->nextchild = NULL;
- }
- else { /* at least 1 child already exists for given parent */
- new_child->nextchild = p->children;/* attach new child to front of list */
- p->children = new_child;
- }
- }
-
- /***************************************************************************
-
- pr_invert(startinv)
-
- Returns: void
-
- Purpose: Print either inverted file. Prints each term, its
- associated document numbers, term-weights and parent
- and child terms.
-
- **/
-
- static void pr_invert(startinv)
- struct invert *startinv; /* in: specifies start of inverted file */
- {
- struct invert *inv_addr; /* tracks add. of current inv. file record */
- struct doclist *doc_addr; /* tracks add. of current doclist record */
- struct parentlist *parent_addr; /* tracks add. of current parentlist record*/
- struct childlist *child_addr; /* tracks add. of current childlist record */
- struct connections *connect_term_add; /* tracks connected terms */
-
- inv_addr = startinv; /* begin at top of inv. file */
- while (inv_addr) { /* while a legitimate term.... */
- (void) fprintf(output,"TERM: %s\nPARENT TERMS: ",inv_addr->term);
- parent_addr = inv_addr->parents; /* find addr. of first parent */
- while (parent_addr) { /* printing all parents */
- (void) fprintf(output,"%s",parent_addr->term);
- parent_addr = parent_addr->nextparent; /* loop through remaining parents */
- if(parent_addr) (void) fprintf(output,", ");
- }
- (void) fprintf(output,"\nCHILD TERMS: ");
- child_addr = inv_addr->children; /* find addr. of first child */
- while (child_addr) { /* printing all children */
- (void) fprintf(output,"%s",child_addr->term);
- child_addr = child_addr->nextchild; /* loop through remaining childrend */
- if(child_addr) (void) fprintf(output,", ");
- }
- (void) fprintf(output,"\nDOCUMENT NUMBER TERM WEIGHT\n");
- doc_addr = inv_addr->doc; /* find addr. of first associated doc. */
- while (doc_addr) { /* printing all documents */
- (void) fprintf(output,"%-30d ",doc_addr->doc);
- (void) fprintf(output,"%-10.5f\n",doc_addr->weight);
- doc_addr = doc_addr->nextdoc; /* loop through remaining docs. */
- }
- /* if the terms is connected then print the term from the other hierarchy */
- (void) fprintf(output,"CONNECTIONS IN OTHER THESAURUS HIERARCHY:\n");
- connect_term_add = inv_addr->connect;
- while (connect_term_add)
- {
- (void) fprintf(output," %s",connect_term_add->termadd->term);
- connect_term_add = connect_term_add->next_connection;
- if(connect_term_add) (void) fprintf(output,", ");
- }
- (void) fprintf(output,"\n\n");
- inv_addr = inv_addr->nextterm; /* loop to next term in inverted file */
- }
- }
-
- /***************************************************************************
-
- simple_merge(startinv1,startinv2)
-
- Returns: void
-
- Purpose: In this function, two terms in different hierarchies are merged
- if they are identical.
-
- **/
-
- static void simple_merge(startinv1, startinv2)
- struct invert *startinv1, /* in: specifies start of 1st inv. file */
- *startinv2; /* in: specifies start of 2nd inv. file */
- {
- struct invert *inv_addr1, *inv_addr2;
- struct connections *r1, *r2; /* storage to hold info. about connected terms */
-
- inv_addr1 = startinv1; /* start with top of 1st inv. file */
- while(inv_addr1) { /* looking for this term in the other inv. file */
- inv_addr2 = find_term(inv_addr1->term, startinv2);
- if (inv_addr2) { /* if term was found then update connect */
- r1 = get_mem_connections();
- r1->termadd = inv_addr2;
- r1->next_connection = inv_addr1->connect;
- inv_addr1->connect = r1;
- r2 = get_mem_connections();
- r2->termadd = inv_addr1;
- r2->next_connection = inv_addr2->connect;
- inv_addr2->connect = r2;
- }
- inv_addr1 = inv_addr1->nextterm;
- }
- }
-
- /***************************************************************************
-
- complex_merge(startinv1,startinv2)
-
- Returns: void
-
- Purpose: In this routine any two terms in different hierarchies are merged
- if they have 'similar' parents and children. Similarity is
- computed and compared to a pre-fixed user specified THRESHOLD
-
- **/
-
- static void complex_merge(startinv1, startinv2)
- struct invert *startinv1, /* in: specifies start of 1st inv. file */
- *startinv2; /* in: specifies start of 2nd inv. file */
- {
- struct invert *inv1_addr, /* tracks current term in 1st inv. file */
- *inv2_addr; /* tracks current term in 2nd inv. file */
- struct connections *r1, *r2; /* tracks connected terms */
- int compare();
-
- inv1_addr = startinv1; /* begin at top of 1st inv. file */
- while(inv1_addr) { /* while addr. legitimate .... */
- inv2_addr = startinv2; /* now begin at top of 2nd inv. file */
- while(inv2_addr) {
- if (compare(inv1_addr,inv2_addr)) { /* this returns 1 of 2 terms are */
- /* similar enough, then connect the two terms */
- r1 = get_mem_connections();
- r1->termadd = inv2_addr;
- r1->next_connection = inv1_addr->connect;
- inv1_addr->connect = r1;
- r2 = get_mem_connections();
- r2->termadd = inv1_addr;
- r2->next_connection = inv2_addr->connect;
- inv2_addr->connect = r2;
- }
- inv2_addr = inv2_addr->nextterm; /* loop through 2nd inv. file */
- }
- inv1_addr = inv1_addr->nextterm; /* loop through 1st inv. file */
- }
- }
-
- /***************************************************************************
-
- compare(p,q)
-
- Returns: int
-
- Purpose: Used to compare two terms for more than just equality.
- A similarity value is computed and if it is greater than a
- THRESHOLD then 1 is returned, else 0
-
- **/
-
- static int compare(p,q)
- struct invert *p, *q; /* addresses of two terms to be compared */
- {
- struct parentlist *parent1, /* tracks parentlist of 1st term */
- *parent2; /* tracks parentlist of 2nd term */
- struct childlist *child1, /* tracks childlist of 1st term */
- *child2; /* trakcs childlist of 2nd term */
- float count1, /* tracks # of parents + children of 1st term */
- count2, /* tracks # of parents + children of 2nd term */
- count; /* tracks # of common parents + children */
-
- count = 0.0; count1 = 0.0; count2 = 0.0; /* initialize all counts */
-
- /* first check # of parents for p->term & the # of common parents */
-
- parent1 = p->parents;
- while(parent1) { /* parent of 1st term */
- count1 = count1 + 1.0;
- parent2 = q->parents;
- while(parent2) { /* loop through parents of second term */
- if(!strncmp(parent1->term,parent2->term,MAXWORD)) count = count + 1.0;
- parent2 = parent2->nextparent;
- }
- parent1 = parent1->nextparent;
- }
-
- /* next compute # of parents for q->term */
-
- parent2 = q->parents;
- while (parent2) { /* loop through parents of 2nd term again */
- count2 = count2 + 1.0;
- parent2 = parent2->nextparent;
- }
-
- /* now check # of children for p->term & the # of common children */
-
- child1 = p->children;
- while(child1) { /* loop through children of 1st term */
- count1 = count1 + 1.0;
- child2 = q->children;
- while(child2) { /* loop through children of 2nd term */
- if (!strncmp(child1->term,child2->term,MAXWORD)) count = count + 1.0;
- child2 = child2->nextchild;
- }
- child1 = child1->nextchild;
- }
-
- /* next compute # of children for q->term */
-
- child2 = q->children;
- while (child2) { /* loop through children of 2nd term */
- count2 = count2 + 1.0;
- child2 = child2->nextchild;
- }
-
- if (count != 0.0) { /* if there is anything in common at all */
- if ((count/(sqrt(count1 * count2))) >= THRESHOLD) {
- /* printf("value is %f\n",(count/(sqrt(count1*count2)))); */
- return(1);
- }
- else return(0);
- }
- return(0);
- }
-
- /***************************************************************************
-
- find_term(term,startinv)
-
- Returns: address of a struct invert record
-
- Purpose: Search for a specified term in the specified inverted file
- and return address of the corresponding record. If not
- found then returns NULL.
-
- **/
-
- static struct invert *find_term(term, startinv)
- char term[MAXWORD]; /* in: term to be searched */
- struct invert *startinv; /* in: inverted file to search in */
- {
- struct invert *inv_addr;
-
- inv_addr = startinv; /* begin at top of inverted file */
- while(inv_addr) {
- if (!strcmp(term,inv_addr->term)) return(inv_addr);
- inv_addr = inv_addr->nextterm;
- }
- return(NULL);
- }
-
- /***************************************************************************
-
- *get_mem_invert()
-
- Returns: address of a struct invert record
-
- Purpose: dynamically obtain enough memory to store 1 invert record
-
- **/
-
- static struct invert *get_mem_invert()
- {
- struct invert *record;
-
- record = (struct invert *)malloc(sizeof(invfile));
- if (!record) {
- (void) fprintf(output,"\nout of memory\n");
- return(NULL);
- }
- return(record);
- }
-
- /***************************************************************************
-
- *get_mem_doclist()
-
- Returns: address of a struct doclist record
-
- Purpose: dynamically obtain enough memory to store 1 doclist record
-
- **/
-
- static struct doclist *get_mem_doclist()
- {
- struct doclist *record;
-
- record = (struct doclist *)malloc(sizeof(doclistfile));
- if (!record) {
- (void) fprintf(output,"\nout of memory\n");
- return(NULL);
- }
- return(record);
- }
-
- /***************************************************************************
-
- *get_mem_parentlist()
-
- Returns: address of a struct parentlist record
-
- Purpose: dynamically obtain enough memory to store 1 parentlist record.
-
- **/
-
- static struct parentlist *get_mem_parentlist()
- {
- struct parentlist *record;
-
- record = (struct parentlist *)malloc(sizeof(parentfile));
- if (!record) {
- (void) fprintf(output,"\nout of memory\n");
- return(NULL);
- }
- return(record);
- }
-
- /***************************************************************************
-
- *get_mem_childlst()
-
- Returns: address of a struct childlist record
-
- Purpose: dynamically obtain enough memory to store 1 childlist record.
-
- **/
-
- static struct childlist *get_mem_childlist()
- {
- struct childlist *record;
-
- record = (struct childlist *)malloc(sizeof(childfile));
- if (!record) {
- (void) fprintf(output,"\nout of memory\n");
- return(NULL);
- }
- return(record);
- }
-
-
- /***************************************************************************
-
- *get_mem_connections()
-
- Returns: address of a struct connections record
-
- Purpose: dynamically obtain enough memory to store 1 connections record.
-
- **/
-
- static struct connections *get_mem_connections()
- {
- struct connections *record;
-
- record = (struct connections *)malloc(sizeof(connectlist));
- if (!record) {
- (void) fprintf(output,"\nout of memory\n");
- return(NULL);
- }
- return(record);
- }
-
-